library(readr)
library(knitr)
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: ggplot2
# Load the housing data set from housing.csv.
housing_data <- read_csv(file = "housing.csv")
## Parsed with column specification:
## cols(
## longitude = col_double(),
## latitude = col_double(),
## housing_median_age = col_double(),
## total_rooms = col_double(),
## total_bedrooms = col_double(),
## population = col_double(),
## households = col_double(),
## median_income = col_double(),
## median_house_value = col_double(),
## ocean_proximity = col_character()
## )
# Print the first 100 values of the response variable.
housing_data[["median_house_value"]][seq_len(100)]
## [1] 452600 358500 352100 341300 342200 269700 299200 241400 226700 261100
## [11] 281500 241800 213500 191300 159200 140000 152500 155500 158700 162900
## [21] 147500 159800 113900 99700 132600 107500 93800 105500 108900 132000
## [31] 122300 115200 110400 104900 109700 97200 104500 103900 191400 176000
## [41] 155400 150000 118800 188800 184400 182300 142500 137500 187500 112500
## [51] 171900 93800 97500 104200 87500 83100 87500 85300 80300 60000
## [61] 75700 75000 86100 76100 73500 78400 84400 81300 85000 129200
## [71] 82500 95200 75000 67500 137500 177500 102100 108300 112500 131300
## [81] 162500 112500 112500 137500 118800 98200 118800 162500 137500 500001
## [91] 162500 137500 162500 187500 179200 130000 183800 125000 170000 193100
# Per-column summary of the data. total_bedrooms reports 207 NA's, which will
# need to be imputed before modelling.
summary(object = housing_data)
## longitude latitude housing_median_age total_rooms
## Min. :-124.3 Min. :32.54 Min. : 1.00 Min. : 2
## 1st Qu.:-121.8 1st Qu.:33.93 1st Qu.:18.00 1st Qu.: 1448
## Median :-118.5 Median :34.26 Median :29.00 Median : 2127
## Mean :-119.6 Mean :35.63 Mean :28.64 Mean : 2636
## 3rd Qu.:-118.0 3rd Qu.:37.71 3rd Qu.:37.00 3rd Qu.: 3148
## Max. :-114.3 Max. :41.95 Max. :52.00 Max. :39320
##
## total_bedrooms population households median_income
## Min. : 1.0 Min. : 3 Min. : 1.0 Min. : 0.4999
## 1st Qu.: 296.0 1st Qu.: 787 1st Qu.: 280.0 1st Qu.: 2.5634
## Median : 435.0 Median : 1166 Median : 409.0 Median : 3.5348
## Mean : 537.9 Mean : 1425 Mean : 499.5 Mean : 3.8707
## 3rd Qu.: 647.0 3rd Qu.: 1725 3rd Qu.: 605.0 3rd Qu.: 4.7432
## Max. :6445.0 Max. :35682 Max. :6082.0 Max. :15.0001
## NA's :207
## median_house_value ocean_proximity
## Min. : 14999 Length:20640
## 1st Qu.:119600 Class :character
## Median :179700 Mode :character
## Mean :206856
## 3rd Qu.:264725
## Max. :500001
##
library(ggplot2)
# Inspect the shape of the total_bedrooms distribution to help decide which
# value to impute for its missing entries.
ggplot(housing_data, aes(x = total_bedrooms)) +
  geom_histogram(bins = 40) +
  xlab("Total Bedrooms") +
  # geom_histogram() plots the number of rows per bin by default, so the
  # y axis is a count, not a density.
  ylab("Count") +
  ggtitle("Histogram of Total Bedrooms (noncontinuous variable)")
## Warning: Removed 207 rows containing non-finite values (stat_bin).

#using mean for now
library(mice)
## Warning: package 'mice' was built under R version 3.5.3
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
# Fill the missing values using mice's "mean" method; the fixed seed keeps
# the run reproducible.
housing_data_temp <- mice(
  data = housing_data,
  method = "mean",
  m = 5,
  seed = 420
)
##
## iter imp variable
## 1 1 total_bedrooms
## 1 2 total_bedrooms
## 1 3 total_bedrooms
## 1 4 total_bedrooms
## 1 5 total_bedrooms
## 2 1 total_bedrooms
## 2 2 total_bedrooms
## 2 3 total_bedrooms
## 2 4 total_bedrooms
## 2 5 total_bedrooms
## 3 1 total_bedrooms
## 3 2 total_bedrooms
## 3 3 total_bedrooms
## 3 4 total_bedrooms
## 3 5 total_bedrooms
## 4 1 total_bedrooms
## 4 2 total_bedrooms
## 4 3 total_bedrooms
## 4 4 total_bedrooms
## 4 5 total_bedrooms
## 5 1 total_bedrooms
## 5 2 total_bedrooms
## 5 3 total_bedrooms
## 5 4 total_bedrooms
## 5 5 total_bedrooms
## Warning: Number of logged events: 1
# Extract the first completed (imputed) data set from the mice result.
housing_data_full <- complete(housing_data_temp, 1)

# Keep only the numeric columns, since cor() requires numeric input. Selecting
# by type rather than by position (previously a hard-coded column 10) keeps
# this correct if the column order or set of text columns ever changes.
housing_data_nc <- Filter(is.numeric, housing_data_full)

# Pairwise correlations between all numeric variables. The matrix is
# symmetric, so it is printed as-is without transposing.
corrmatrix <- cor(housing_data_nc)
kable(corrmatrix)
|                    |  longitude |   latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value |
|:-------------------|-----------:|-----------:|-------------------:|------------:|---------------:|-----------:|-----------:|--------------:|-------------------:|
| longitude          |  1.0000000 | -0.9246644 |         -0.1081968 |   0.0445680 |      0.0692597 |  0.0997732 |  0.0553101 |    -0.0151759 |         -0.0459666 |
| latitude           | -0.9246644 |  1.0000000 |          0.0111727 |  -0.0360996 |     -0.0666584 | -0.1087847 | -0.0710354 |    -0.0798091 |         -0.1441603 |
| housing_median_age | -0.1081968 |  0.0111727 |          1.0000000 |  -0.3612622 |     -0.3189983 | -0.2962442 | -0.3029160 |    -0.1190340 |          0.1056234 |
| total_rooms        |  0.0445680 | -0.0360996 |         -0.3612622 |   1.0000000 |      0.9272527 |  0.8571260 |  0.9184845 |     0.1980496 |          0.1341531 |
| total_bedrooms     |  0.0692597 | -0.0666584 |         -0.3189983 |   0.9272527 |      1.0000000 |  0.8739095 |  0.9747249 |    -0.0076819 |          0.0494535 |
| population         |  0.0997732 | -0.1087847 |         -0.2962442 |   0.8571260 |      0.8739095 |  1.0000000 |  0.9072223 |     0.0048343 |         -0.0246497 |
| households         |  0.0553101 | -0.0710354 |         -0.3029160 |   0.9184845 |      0.9747249 |  0.9072223 |  1.0000000 |     0.0130331 |          0.0658427 |
| median_income      | -0.0151759 | -0.0798091 |         -0.1190340 |   0.1980496 |     -0.0076819 |  0.0048343 |  0.0130331 |     1.0000000 |          0.6880752 |
| median_house_value | -0.0459666 | -0.1441603 |          0.1056234 |   0.1341531 |      0.0494535 | -0.0246497 |  0.0658427 |     0.6880752 |          1.0000000 |
# Flag highly correlated variables: findCorrelation() searches the correlation
# matrix for pairs whose absolute correlation exceeds the cutoff and returns
# the column positions it suggests dropping (positions, not names).
highcorr <- findCorrelation(x = corrmatrix, cutoff = 0.60)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(RColorBrewer)
library(plotly)
## Warning: package 'plotly' was built under R version 3.5.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive map of the data: one point per row of housing_data_full, placed
# by longitude/latitude, coloured by median house value and sized by
# population.
plot_map <- ggplot(
  housing_data_full,
  aes(
    x = longitude, y = latitude, color = median_house_value,
    # hma/tr/tb/hh/mi are not real ggplot2 aesthetics and draw nothing;
    # presumably they are mapped so that ggplotly() lists these variables in
    # the hover tooltip -- confirm before removing.
    hma = housing_median_age, tr = total_rooms, tb = total_bedrooms,
    hh = households, mi = median_income
  )
) +
  geom_point(aes(size = population), alpha = 0.4) +
  xlab("Longitude") +
  ylab("Latitude") +
  # Fixed the "Longtitude" typo in the displayed title.
  ggtitle("Data Map - Longitude vs Latitude and Associated Variables") +
  # Centre the title above the panel.
  theme(plot.title = element_text(hjust = 0.5)) +
  # NOTE(review): "Paired" is a qualitative Brewer palette being interpolated
  # over a continuous variable; a sequential palette may read better.
  scale_color_distiller(palette = "Paired", labels = comma) +
  labs(color = "Median House Value (in $USD)", size = "Population")

# Convert the static ggplot into an interactive plotly widget and display it.
plot_map_tt <- ggplotly(plot_map)
plot_map_tt